Hadoop Learning Path (10): Using the HDFS API


Advanced HDFS API programming

The HDFS client API boils down to two classes: FileSystem and Configuration.
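
For orientation only, here is a minimal skeleton of how the two classes are used together (the NameNode address hdfs://hadoop1:9000 is the one used throughout this post); every example below is a variation on it:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsClientSkeleton {
    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "hadoop");   // run as the hadoop user
        Configuration conf = new Configuration();           // holds all client-side settings
        conf.set("fs.defaultFS", "hdfs://hadoop1:9000");     // point the client at the NameNode
        FileSystem fs = FileSystem.get(conf);                // the handle all operations go through

        System.out.println(fs.exists(new Path("/")));        // e.g. check that the root exists
        fs.close();
    }
}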

1. Uploading and downloading files

package com.ghgj.hdfs.api;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HDFS_GET_AND_PUT {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop1:9000");
        conf.set("dfs.replication", "2");
        FileSystem fs = FileSystem.get(conf);

        /**
         * There are two ways to change the user the operations run as:
         *
         * 1. Set the user name of the runtime directly:
         *
         *    VM arguments: -DHADOOP_USER_NAME=hadoop
         *
         * 2. Declare it in code:
         *
         *    System.setProperty("HADOOP_USER_NAME", "hadoop");
         */
        System.setProperty("HADOOP_USER_NAME", "hadoop");

        // Upload
        fs.copyFromLocalFile(new Path("c:/sss.txt"), new Path("/a/ggg.txt"));

        /**
         * .crc : checksum file
         *
         * The metadata of each block records the range of the valid data, e.g. qqq.txt blk_41838 : 0 - 1100 bytes
         *
         * If data is appended to the file outside of HDFS, the valid data can still be downloaded.
         * But if data inside the valid range (here 0 - 1100) is modified, the checksum computed with the CRC algorithm
         * no longer matches the checksum stored when the file was first written to HDFS, and HDFS treats the file as corrupted.
         */

        // Download
        fs.copyToLocalFile(new Path("/a/qqq.txt"), new Path("c:/qqq3.txt"));

        /**
         * Under the hood, the upload and download APIs are both wrappers around FileUtil.copy(...)
         */

        fs.close();
    }
}
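
As the last comment notes, copyFromLocalFile/copyToLocalFile delegate to FileUtil.copy. A minimal sketch of calling it directly, under the same cluster settings as above (the local path c:/sss.txt is only illustrative):

package com.ghgj.hdfs.api;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class HDFS_FILEUTIL_COPY {

    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop1:9000");

        FileSystem localFs = FileSystem.getLocal(conf);   // the local file system
        FileSystem hdfs = FileSystem.get(conf);           // HDFS

        // Copy a local file into HDFS; deleteSource = false keeps the local copy
        FileUtil.copy(localFs, new Path("c:/sss.txt"),
                      hdfs, new Path("/a/sss.txt"),
                      false, conf);

        hdfs.close();
    }
}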

2. The Configuration object and configuration files

package com.exam.hdfs;

import java.io.IOException;
import java.util.Iterator;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class TestConf1 {

    public static void main(String[] args) throws Exception {

        /**
         * Under the hood a set of default configuration files is loaded:
         *
         * core-default.xml
         * hdfs-default.xml
         * mapred-default.xml
         * yarn-default.xml
         */
        Configuration conf = new Configuration();
        // conf.addResource("hdfs-default.xml");

        /**
         * Here hdfs-site.xml sits under src in this project, i.e. on the classpath,
         * so when the FileSystem is initialized, the name-value pairs in hdfs-site.xml are parsed into conf.
         *
         * However:
         *
         * 1. If hdfs-site.xml is not under src, is it still loaded? No.
         *
         * 2. If the file is not named hdfs-default.xml or hdfs-site.xml, is it loaded automatically? No.
         *
         * Conclusion:
         *
         * For a configuration file to be loaded automatically, it must be named *-default.xml or *-site.xml
         * and it must be placed under src (on the classpath).
         *
         * If a file has a different name, or lives somewhere else, but its parameters are still needed,
         * it has to be loaded manually through the methods the conf object provides.
         */
        // conf.addResource("hdfs-site.xml");
        conf.set("dfs.replication", "1");
        conf.addResource("myconfig/hdfs-site.xml");

        /**
         * Parameters are applied in the following order:
         *
         * 1. core/hdfs/mapred/yarn-default.xml
         *
         * 2. configuration files loaded through conf.addResource()
         *
         * 3. conf.set(name, value)
         *
         * Values set programmatically with conf.set() win, as the dfs.replication value printed below shows.
         */

        FileSystem fs = FileSystem.get(conf);

        System.out.println(conf.get("dfs.replication"));

        Iterator<Entry<String, String>> iterator = conf.iterator();
        while (iterator.hasNext()) {
            Entry<String, String> e = iterator.next();
            System.out.println(e.getKey() + "\t" + e.getValue());
        }
    }
}
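
When it is unclear which of these layers a value came from, Configuration can report the source of a property. A small hedged sketch, assuming the Configuration.getPropertySources method of Hadoop 2.x is available:

import org.apache.hadoop.conf.Configuration;

public class TestConfSources {

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("dfs.replication", "1");
        conf.addResource("myconfig/hdfs-site.xml");

        // getPropertySources reports which resource(s) defined a property,
        // e.g. hdfs-default.xml, an added resource, or "programmatically" for conf.set()
        String[] sources = conf.getPropertySources("dfs.replication");
        if (sources != null) {
            for (String s : sources) {
                System.out.println("dfs.replication set by: " + s);
            }
        }
    }
}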

Output:

  1 log4j:WARN No appenders could be found for logger (org.apache.hadoop.metrics2.lib.MutableMetricsFactory).
2 log4j:WARN Please initialize the log4j system properly.
3 log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
4 1
5 hadoop.security.groups.cache.secs 300
6 dfs.datanode.cache.revocation.timeout.ms 900000
7 dfs.namenode.resource.check.interval 5000
8 s3.client-write-packet-size 65536
9 dfs.client.https.need-auth false
10 dfs.replication 1
11 hadoop.security.group.mapping.ldap.directory.search.timeout 10000
12 dfs.datanode.available-space-volume-choosing-policy.balanced-space-threshold 10737418240
13 hadoop.work.around.non.threadsafe.getpwuid false
14 dfs.namenode.write-lock-reporting-threshold-ms 5000
15 fs.ftp.host.port 21
16 dfs.namenode.avoid.read.stale.datanode false
17 dfs.journalnode.rpc-address 0.0.0.0:8485
18 hadoop.security.kms.client.encrypted.key.cache.expiry 43200000
19 ipc.client.connection.maxidletime 10000
20 hadoop.registry.zk.session.timeout.ms 60000
21 tfile.io.chunk.size 1048576
22 fs.automatic.close true
23 ha.health-monitor.sleep-after-disconnect.ms 1000
24 io.map.index.interval 128
25 dfs.namenode.https-address 0.0.0.0:50470
26 dfs.mover.max-no-move-interval 60000
27 io.seqfile.sorter.recordlimit 1000000
28 fs.s3n.multipart.uploads.enabled false
29 hadoop.util.hash.type murmur
30 dfs.namenode.replication.min 1
31 dfs.datanode.directoryscan.threads 1
32 dfs.namenode.fs-limits.min-block-size 1048576
33 dfs.datanode.directoryscan.interval 21600
34 fs.AbstractFileSystem.file.impl org.apache.hadoop.fs.local.LocalFs
35 dfs.namenode.acls.enabled false
36 dfs.client.short.circuit.replica.stale.threshold.ms 1800000
37 net.topology.script.number.args 100
38 hadoop.http.authentication.token.validity 36000
39 fs.s3.block.size 67108864
40 dfs.namenode.resource.du.reserved 104857600
41 ha.failover-controller.graceful-fence.rpc-timeout.ms 5000
42 s3native.bytes-per-checksum 512
43 dfs.namenode.datanode.registration.ip-hostname-check true
44 dfs.namenode.path.based.cache.block.map.allocation.percent 0.25
45 dfs.namenode.backup.http-address 0.0.0.0:50105
46 hadoop.security.group.mapping org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback
47 dfs.namenode.edits.noeditlogchannelflush false
48 dfs.datanode.cache.revocation.polling.ms 500
49 dfs.namenode.audit.loggers default
50 hadoop.security.groups.cache.warn.after.ms 5000
51 io.serializations org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization
52 dfs.namenode.lazypersist.file.scrub.interval.sec 300
53 fs.s3a.threads.core 15
54 hadoop.security.crypto.buffer.size 8192
55 hadoop.http.cross-origin.allowed-methods GET,POST,HEAD
56 hadoop.registry.zk.retry.interval.ms 1000
57 dfs.http.policy HTTP_ONLY
58 hadoop.registry.secure false
59 dfs.namenode.replication.interval 3
60 dfs.namenode.safemode.min.datanodes 0
61 dfs.client.file-block-storage-locations.num-threads 10
62 nfs.dump.dir /tmp/.hdfs-nfs
63 dfs.namenode.secondary.https-address 0.0.0.0:50091
64 hadoop.kerberos.kinit.command kinit
65 dfs.block.access.token.lifetime 600
66 dfs.webhdfs.enabled true
67 dfs.client.use.datanode.hostname false
68 dfs.namenode.delegation.token.max-lifetime 604800000
69 fs.trash.interval 0
70 dfs.datanode.drop.cache.behind.writes false
71 dfs.namenode.avoid.write.stale.datanode false
72 dfs.namenode.num.extra.edits.retained 1000000
73 s3.blocksize 67108864
74 ipc.client.connect.max.retries.on.timeouts 45
75 dfs.datanode.data.dir /home/hadoop/data/hadoopdata/data
76 fs.s3.buffer.dir ${hadoop.tmp.dir}/s3
77 fs.s3n.block.size 67108864
78 nfs.exports.allowed.hosts * rw
79 ha.health-monitor.connect-retry-interval.ms 1000
80 hadoop.security.instrumentation.requires.admin false
81 hadoop.registry.zk.retry.ceiling.ms 60000
82 nfs.rtmax 1048576
83 dfs.client.mmap.cache.size 256
84 dfs.datanode.data.dir.perm 700
85 io.file.buffer.size 4096
86 dfs.namenode.backup.address 0.0.0.0:50100
87 dfs.client.datanode-restart.timeout 30
88 dfs.datanode.readahead.bytes 4194304
89 dfs.namenode.xattrs.enabled true
90 io.mapfile.bloom.size 1048576
91 ipc.client.connect.retry.interval 1000
92 dfs.client-write-packet-size 65536
93 dfs.namenode.checkpoint.txns 1000000
94 dfs.datanode.bp-ready.timeout 20
95 dfs.datanode.transfer.socket.send.buffer.size 131072
96 hadoop.security.kms.client.authentication.retry-count 1
97 dfs.client.block.write.retries 3
98 fs.swift.impl org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem
99 ha.failover-controller.graceful-fence.connection.retries 1
100 hadoop.registry.zk.connection.timeout.ms 15000
101 dfs.namenode.safemode.threshold-pct 0.999f
102 dfs.cachereport.intervalMsec 10000
103 hadoop.security.java.secure.random.algorithm SHA1PRNG
104 ftp.blocksize 67108864
105 dfs.namenode.list.cache.directives.num.responses 100
106 dfs.namenode.kerberos.principal.pattern *
107 file.stream-buffer-size 4096
108 dfs.datanode.dns.nameserver default
109 fs.s3a.max.total.tasks 1000
110 dfs.namenode.replication.considerLoad true
111 nfs.allow.insecure.ports true
112 dfs.namenode.edits.journal-plugin.qjournal org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager
113 dfs.client.write.exclude.nodes.cache.expiry.interval.millis 600000
114 dfs.client.mmap.cache.timeout.ms 3600000
115 ipc.client.idlethreshold 4000
116 io.skip.checksum.errors false
117 ftp.stream-buffer-size 4096
118 fs.s3a.fast.upload false
119 dfs.client.failover.connection.retries.on.timeouts 0
120 file.blocksize 67108864
121 ftp.replication 3
122 dfs.namenode.replication.work.multiplier.per.iteration 2
123 hadoop.security.authorization false
124 hadoop.http.authentication.simple.anonymous.allowed true
125 s3native.client-write-packet-size 65536
126 hadoop.rpc.socket.factory.class.default org.apache.hadoop.net.StandardSocketFactory
127 file.bytes-per-checksum 512
128 dfs.datanode.slow.io.warning.threshold.ms 300
129 fs.har.impl.disable.cache true
130 rpc.engine.org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB org.apache.hadoop.ipc.ProtobufRpcEngine
131 io.seqfile.lazydecompress true
132 dfs.namenode.reject-unresolved-dn-topology-mapping false
133 hadoop.common.configuration.version 0.23.0
134 hadoop.security.authentication simple
135 dfs.datanode.drop.cache.behind.reads false
136 dfs.image.compression.codec org.apache.hadoop.io.compress.DefaultCodec
137 dfs.client.read.shortcircuit.streams.cache.size 256
138 file.replication 1
139 dfs.namenode.top.num.users 10
140 dfs.namenode.accesstime.precision 3600000
141 dfs.namenode.fs-limits.max-xattrs-per-inode 32
142 dfs.image.transfer.timeout 60000
143 io.mapfile.bloom.error.rate 0.005
144 nfs.wtmax 1048576
145 hadoop.security.kms.client.encrypted.key.cache.size 500
146 dfs.namenode.edit.log.autoroll.check.interval.ms 300000
147 fs.s3a.multipart.purge false
148 dfs.namenode.support.allow.format true
149 hadoop.hdfs.configuration.version 1
150 fs.s3a.connection.establish.timeout 5000
151 hadoop.security.group.mapping.ldap.search.attr.member member
152 dfs.secondary.namenode.kerberos.internal.spnego.principal ${dfs.web.authentication.kerberos.principal}
153 dfs.stream-buffer-size 4096
154 hadoop.ssl.client.conf ssl-client.xml
155 dfs.namenode.invalidate.work.pct.per.iteration 0.32f
156 fs.s3a.multipart.purge.age 86400
157 dfs.journalnode.https-address 0.0.0.0:8481
158 dfs.namenode.top.enabled true
159 hadoop.security.kms.client.encrypted.key.cache.low-watermark 0.3f
160 dfs.namenode.max.objects 0
161 hadoop.user.group.static.mapping.overrides dr.who=;
162 fs.s3a.fast.buffer.size 1048576
163 dfs.bytes-per-checksum 512
164 dfs.datanode.max.transfer.threads 4096
165 dfs.block.access.key.update.interval 600
166 ipc.maximum.data.length 67108864
167 tfile.fs.input.buffer.size 262144
168 ha.failover-controller.new-active.rpc-timeout.ms 60000
169 dfs.client.cached.conn.retry 3
170 dfs.client.read.shortcircuit false
171 hadoop.ssl.hostname.verifier DEFAULT
172 dfs.datanode.hdfs-blocks-metadata.enabled false
173 dfs.datanode.directoryscan.throttle.limit.ms.per.sec 0
174 dfs.image.transfer.chunksize 65536
175 hadoop.http.authentication.type simple
176 dfs.namenode.list.encryption.zones.num.responses 100
177 dfs.client.https.keystore.resource ssl-client.xml
178 s3native.blocksize 67108864
179 net.topology.impl org.apache.hadoop.net.NetworkTopology
180 dfs.client.failover.sleep.base.millis 500
181 io.seqfile.compress.blocksize 1000000
182 dfs.namenode.path.based.cache.refresh.interval.ms 30000
183 dfs.namenode.decommission.interval 30
184 dfs.permissions.superusergroup supergroup
185 dfs.namenode.fs-limits.max-directory-items 1048576
186 hadoop.registry.zk.retry.times 5
187 dfs.ha.log-roll.period 120
188 fs.AbstractFileSystem.ftp.impl org.apache.hadoop.fs.ftp.FtpFs
189 ftp.bytes-per-checksum 512
190 dfs.user.home.dir.prefix /user
191 dfs.namenode.checkpoint.edits.dir ${dfs.namenode.checkpoint.dir}
192 dfs.client.socket.send.buffer.size 131072
193 ipc.client.fallback-to-simple-auth-allowed false
194 dfs.blockreport.initialDelay 0
195 dfs.namenode.inotify.max.events.per.rpc 1000
196 dfs.namenode.heartbeat.recheck-interval 300000
197 dfs.namenode.safemode.extension 30000
198 dfs.client.failover.sleep.max.millis 15000
199 dfs.namenode.delegation.key.update-interval 86400000
200 dfs.datanode.transfer.socket.recv.buffer.size 131072
201 hadoop.rpc.protection authentication
202 fs.permissions.umask-mode 022
203 fs.s3.sleepTimeSeconds 10
204 dfs.namenode.fs-limits.max-xattr-size 16384
205 ha.health-monitor.rpc-timeout.ms 45000
206 hadoop.http.staticuser.user dr.who
207 dfs.datanode.http.address 0.0.0.0:50075
208 fs.s3a.connection.maximum 15
209 fs.s3a.paging.maximum 5000
210 fs.AbstractFileSystem.viewfs.impl org.apache.hadoop.fs.viewfs.ViewFs
211 dfs.namenode.blocks.per.postponedblocks.rescan 10000
212 fs.ftp.host 0.0.0.0
213 dfs.lock.suppress.warning.interval 10s
214 hadoop.http.authentication.kerberos.keytab ${user.home}/hadoop.keytab
215 fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
216 hadoop.registry.zk.root /registry
217 hadoop.jetty.logs.serve.aliases true
218 dfs.namenode.fs-limits.max-blocks-per-file 1048576
219 dfs.balancer.keytab.enabled false
220 dfs.client.block.write.replace-datanode-on-failure.enable true
221 hadoop.http.cross-origin.max-age 1800
222 io.compression.codec.bzip2.library system-native
223 dfs.namenode.checkpoint.dir file://${hadoop.tmp.dir}/dfs/namesecondary
224 dfs.client.use.legacy.blockreader.local false
225 dfs.namenode.top.windows.minutes 1,5,25
226 ipc.ping.interval 60000
227 net.topology.node.switch.mapping.impl org.apache.hadoop.net.ScriptBasedMapping
228 nfs.mountd.port 4242
229 dfs.storage.policy.enabled true
230 dfs.namenode.list.cache.pools.num.responses 100
231 fs.df.interval 60000
232 nfs.server.port 2049
233 ha.zookeeper.parent-znode /hadoop-ha
234 hadoop.http.cross-origin.allowed-headers X-Requested-With,Content-Type,Accept,Origin
235 dfs.datanode.block-pinning.enabled false
236 dfs.namenode.num.checkpoints.retained 2
237 fs.s3a.attempts.maximum 10
238 s3native.stream-buffer-size 4096
239 io.seqfile.local.dir ${hadoop.tmp.dir}/io/local
240 fs.s3n.multipart.copy.block.size 5368709120
241 dfs.encrypt.data.transfer.cipher.key.bitlength 128
242 dfs.client.mmap.retry.timeout.ms 300000
243 dfs.datanode.sync.behind.writes false
244 dfs.namenode.fslock.fair true
245 hadoop.ssl.keystores.factory.class org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory
246 dfs.permissions.enabled true
247 fs.AbstractFileSystem.hdfs.impl org.apache.hadoop.fs.Hdfs
248 dfs.blockreport.split.threshold 1000000
249 dfs.datanode.balance.bandwidthPerSec 1048576
250 dfs.block.scanner.volume.bytes.per.second 1048576
251 hadoop.security.random.device.file.path /dev/urandom
252 fs.s3.maxRetries 4
253 hadoop.http.filter.initializers org.apache.hadoop.http.lib.StaticUserWebFilter
254 dfs.namenode.stale.datanode.interval 30000
255 ipc.client.rpc-timeout.ms 0
256 fs.client.resolve.remote.symlinks true
257 dfs.default.chunk.view.size 32768
258 hadoop.ssl.enabled.protocols TLSv1
259 dfs.namenode.decommission.blocks.per.interval 500000
260 dfs.namenode.handler.count 10
261 dfs.image.transfer.bandwidthPerSec 0
262 rpc.metrics.quantile.enable false
263 hadoop.ssl.enabled false
264 dfs.replication.max 512
265 dfs.namenode.name.dir /home/hadoop/data/hadoopdata/name
266 dfs.namenode.read-lock-reporting-threshold-ms 5000
267 dfs.datanode.https.address 0.0.0.0:50475
268 dfs.datanode.failed.volumes.tolerated 0
269 ipc.client.kill.max 10
270 fs.s3a.threads.max 256
271 ipc.server.listen.queue.size 128
272 dfs.client.domain.socket.data.traffic false
273 dfs.block.access.token.enable false
274 dfs.blocksize 134217728
275 fs.s3a.connection.timeout 50000
276 fs.s3a.threads.keepalivetime 60
277 file.client-write-packet-size 65536
278 dfs.datanode.address 0.0.0.0:50010
279 ha.failover-controller.cli-check.rpc-timeout.ms 20000
280 ha.zookeeper.acl world:anyone:rwcda
281 ipc.client.connect.max.retries 10
282 dfs.encrypt.data.transfer false
283 dfs.namenode.write.stale.datanode.ratio 0.5f
284 ipc.client.ping true
285 dfs.datanode.shared.file.descriptor.paths /dev/shm,/tmp
286 dfs.short.circuit.shared.memory.watcher.interrupt.check.ms 60000
287 hadoop.tmp.dir /home/hadoop/data/hadoopdata
288 dfs.datanode.handler.count 10
289 dfs.client.failover.max.attempts 15
290 dfs.balancer.max-no-move-interval 60000
291 dfs.client.read.shortcircuit.streams.cache.expiry.ms 300000
292 dfs.namenode.block-placement-policy.default.prefer-local-node true
293 hadoop.ssl.require.client.cert false
294 hadoop.security.uid.cache.secs 14400
295 dfs.client.read.shortcircuit.skip.checksum false
296 dfs.namenode.resource.checked.volumes.minimum 1
297 hadoop.registry.rm.enabled false
298 dfs.namenode.quota.init-threads 4
299 dfs.namenode.max.extra.edits.segments.retained 10000
300 dfs.webhdfs.user.provider.user.pattern ^[A-Za-z_][A-Za-z0-9._-]*[$]?$
301 dfs.client.mmap.enabled true
302 dfs.client.file-block-storage-locations.timeout.millis 1000
303 dfs.datanode.block.id.layout.upgrade.threads 12
304 dfs.datanode.use.datanode.hostname false
305 hadoop.fuse.timer.period 5
306 dfs.client.context default
307 fs.trash.checkpoint.interval 0
308 dfs.journalnode.http-address 0.0.0.0:8480
309 dfs.balancer.address 0.0.0.0:0
310 dfs.namenode.lock.detailed-metrics.enabled false
311 dfs.namenode.delegation.token.renew-interval 86400000
312 ha.health-monitor.check-interval.ms 1000
313 dfs.namenode.retrycache.heap.percent 0.03f
314 ipc.client.connect.timeout 20000
315 dfs.reformat.disabled false
316 dfs.blockreport.intervalMsec 21600000
317 fs.s3a.multipart.threshold 2147483647
318 dfs.https.server.keystore.resource ssl-server.xml
319 hadoop.http.cross-origin.enabled false
320 io.map.index.skip 0
321 dfs.balancer.block-move.timeout 0
322 io.native.lib.available true
323 s3.replication 3
324 dfs.namenode.kerberos.internal.spnego.principal ${dfs.web.authentication.kerberos.principal}
325 fs.AbstractFileSystem.har.impl org.apache.hadoop.fs.HarFs
326 hadoop.security.kms.client.encrypted.key.cache.num.refill.threads 2
327 fs.s3n.multipart.uploads.block.size 67108864
328 dfs.image.compress false
329 dfs.datanode.dns.interface default
330 dfs.datanode.available-space-volume-choosing-policy.balanced-space-preference-fraction 0.75f
331 tfile.fs.output.buffer.size 262144
332 fs.du.interval 600000
333 dfs.client.failover.connection.retries 0
334 dfs.namenode.edit.log.autoroll.multiplier.threshold 2.0
335 hadoop.security.group.mapping.ldap.ssl false
336 dfs.namenode.top.window.num.buckets 10
337 fs.s3a.buffer.dir ${hadoop.tmp.dir}/s3a
338 dfs.namenode.checkpoint.check.period 60
339 fs.defaultFS hdfs://hadoop1:9000
340 fs.s3a.multipart.size 104857600
341 dfs.client.slow.io.warning.threshold.ms 30000
342 dfs.datanode.max.locked.memory 0
343 dfs.namenode.retrycache.expirytime.millis 600000
344 hadoop.security.group.mapping.ldap.search.attr.group.name cn
345 dfs.client.block.write.replace-datanode-on-failure.best-effort false
346 dfs.ha.fencing.ssh.connect-timeout 30000
347 dfs.datanode.scan.period.hours 504
348 hadoop.registry.zk.quorum localhost:2181
349 dfs.namenode.fs-limits.max-component-length 255
350 hadoop.http.cross-origin.allowed-origins *
351 dfs.namenode.enable.retrycache true
352 dfs.datanode.du.reserved 0
353 dfs.datanode.ipc.address 0.0.0.0:50020
354 hadoop.registry.system.acls sasl:yarn@, sasl:mapred@, sasl:hdfs@
355 dfs.namenode.path.based.cache.retry.interval.ms 30000
356 hadoop.security.crypto.cipher.suite AES/CTR/NoPadding
357 dfs.client.block.write.replace-datanode-on-failure.policy DEFAULT
358 dfs.namenode.http-address 0.0.0.0:50070
359 hadoop.security.crypto.codec.classes.aes.ctr.nopadding org.apache.hadoop.crypto.OpensslAesCtrCryptoCodec,org.apache.hadoop.crypto.JceAesCtrCryptoCodec
360 dfs.ha.tail-edits.period 60
361 hadoop.security.groups.negative-cache.secs 30
362 hadoop.ssl.server.conf ssl-server.xml
363 hadoop.registry.jaas.context Client
364 s3native.replication 3
365 hadoop.security.group.mapping.ldap.search.filter.group (objectClass=group)
366 hadoop.http.authentication.kerberos.principal HTTP/_HOST@LOCALHOST
367 dfs.namenode.startup.delay.block.deletion.sec 0
368 hadoop.security.group.mapping.ldap.search.filter.user (&(objectClass=user)(sAMAccountName={0}))
369 dfs.namenode.edits.dir ${dfs.namenode.name.dir}
370 dfs.namenode.checkpoint.max-retries 3
371 s3.stream-buffer-size 4096
372 ftp.client-write-packet-size 65536
373 dfs.datanode.fsdatasetcache.max.threads.per.volume 4
374 hadoop.security.sensitive-config-keys password$,fs.s3.*[Ss]ecret.?[Kk]ey,fs.azure.account.key.*,dfs.webhdfs.oauth2.[a-z]+.token,hadoop.security.sensitive-config-keys
375 dfs.namenode.decommission.max.concurrent.tracked.nodes 100
376 dfs.namenode.name.dir.restore false
377 ipc.server.log.slow.rpc false
378 dfs.heartbeat.interval 3
379 dfs.namenode.secondary.http-address hadoop3:50090
380 ha.zookeeper.session-timeout.ms 5000
381 s3.bytes-per-checksum 512
382 fs.s3a.connection.ssl.enabled true
383 hadoop.http.authentication.signature.secret.file ${user.home}/hadoop-http-auth-signature-secret
384 hadoop.fuse.connection.timeout 300
385 dfs.namenode.checkpoint.period 3600
386 ipc.server.max.connections 0
387 dfs.ha.automatic-failover.enabled false

3. Listing the files under a given directory along with their block information

package com.exam.hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class TestHDFS1 {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        conf.set("fs.defaultFS", "hdfs://hadoop1:9000");
        FileSystem fs = FileSystem.get(conf);

        /**
         * List all files under the given directory (recursively)
         */
        RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(new Path("/"), true);
        while (listFiles.hasNext()) {
            LocatedFileStatus file = listFiles.next();

            System.out.println(file.getPath() + "\t");
            System.out.println(file.getPath().getName() + "\t");
            System.out.println(file.getLen() + "\t");
            System.out.println(file.getReplication() + "\t");

            /**
             * What is the length of blockLocations, and what does it mean?
             *
             * It is the number of blocks of the file.
             */
            BlockLocation[] blockLocations = file.getBlockLocations();
            System.out.println(blockLocations.length + "\t");

            for (BlockLocation bl : blockLocations) {
                String[] hosts = bl.getHosts();
                // Note: printing hosts[1] assumes every block has at least two replicas
                System.out.print(hosts[0] + "-" + hosts[1] + "\t");
            }
            System.out.println();
        }
    }
}

Output:

hdfs://hadoop1:9000/aa/bb/cc/hadoop.tar.gz
hadoop.tar.gz
199007110
2
3
hadoop3-hadoop1 hadoop1-hadoop2 hadoop1-hadoop4
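
Each BlockLocation also carries the block's offset and length, which is often more useful than just the host names. A small hedged sketch (same cluster and file as above) that prints them:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class TestBlockLocations {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        conf.set("fs.defaultFS", "hdfs://hadoop1:9000");
        FileSystem fs = FileSystem.get(conf);

        Path file = new Path("/aa/bb/cc/hadoop.tar.gz");
        FileStatus status = fs.getFileStatus(file);

        // Ask for the locations of every block of the file: from offset 0 up to the file length
        BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, status.getLen());
        for (BlockLocation b : blocks) {
            // offset = where the block starts in the file, length = number of bytes in the block
            System.out.println(b.getOffset() + "," + b.getLength() + "," + String.join(",", b.getHosts()));
        }
        fs.close();
    }
}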

4. Uploading a file via a stream

package com.exam.hdfs;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class UploadDataByStream {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        conf.set("fs.defaultFS", "hdfs://hadoop1:9000");
        FileSystem fs = FileSystem.get(conf);

        InputStream in = new FileInputStream(new File("d:/abc.tar.gz"));
        FSDataOutputStream out = fs.create(new Path("/aa/abc.tar.gz"));

        // Copy with a 4096-byte buffer and close both streams when done
        IOUtils.copyBytes(in, out, 4096, true);

        fs.close();
    }
}

5. Downloading a file via a stream

package com.exam.hdfs;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class DownloadDataByStream {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        conf.set("fs.defaultFS", "hdfs://hadoop1:9000");
        FileSystem fs = FileSystem.get(conf);

        FSDataInputStream in = fs.open(new Path("/aa/abc.tar.gz"));
        OutputStream out = new FileOutputStream(new File("D:/abc.sh"));

        IOUtils.copyBytes(in, out, 4096, true);

        fs.close();
    }
}
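
The two stream-based examples above can be condensed into one helper. A hedged sketch (class and method names are mine) that relies on try-with-resources instead of asking copyBytes to close the streams:

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class StreamCopyUtil {

    // Upload: local file -> HDFS path
    public static void upload(FileSystem fs, String localFile, String hdfsPath) throws Exception {
        try (InputStream in = new FileInputStream(localFile);
             OutputStream out = fs.create(new Path(hdfsPath))) {
            IOUtils.copyBytes(in, out, 4096, false);   // streams are closed by try-with-resources
        }
    }

    // Download: HDFS path -> local file
    public static void download(FileSystem fs, String hdfsPath, String localFile) throws Exception {
        try (InputStream in = fs.open(new Path(hdfsPath));
             OutputStream out = new FileOutputStream(localFile)) {
            IOUtils.copyBytes(in, out, 4096, false);
        }
    }

    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop1:9000");
        try (FileSystem fs = FileSystem.get(conf)) {
            upload(fs, "d:/abc.tar.gz", "/aa/abc.tar.gz");
            download(fs, "/aa/abc.tar.gz", "d:/abc_copy.tar.gz");
        }
    }
}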

6. Deleting files of a particular type under a given path, e.g. .class files or .txt files

package com.exam.hdfs;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HDFS_DELETE_CLASS {

    public static final String FILETYPE = "tar.gz";
    public static final String DELETE_PATH = "/aa";

    public static void main(String[] args) throws Exception {

        new HDFS_DELETE_CLASS().rmrClassFile(new Path(DELETE_PATH));
    }

    public void rmrClassFile(Path path) throws Exception {

        // First gather the cluster information needed to obtain the FileSystem instance fs
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(new URI("hdfs://hadoop1:9000"), conf, "hadoop");

        // Check whether path itself is a file or a directory
        FileStatus fileStatus = fs.getFileStatus(path);
        boolean directory = fileStatus.isDirectory();

        // Handle it accordingly
        if (directory) {
            // It is a directory
            checkAndDeleteDirectory(path, fs);
        } else {
            // It is a file: check whether its name matches FILETYPE
            checkAndDeleteFile(path, fs);
        }
    }

    // Handle a directory
    public static void checkAndDeleteDirectory(Path path, FileSystem fs) throws Exception {
        // Look at the immediate children (files and subdirectories) of path
        FileStatus[] listStatus = fs.listStatus(path);
        for (FileStatus fStatus : listStatus) {
            Path p = fStatus.getPath();
            // If it is a file ending in FILETYPE, delete it; otherwise keep descending
            if (fStatus.isFile()) {
                checkAndDeleteFile(p, fs);
            } else {
                checkAndDeleteDirectory(p, fs);
            }
        }
    }

    // Check whether a file matches the deletion rule; delete it if it does, otherwise leave it alone
    public static void checkAndDeleteFile(Path path, FileSystem fs) throws Exception {
        String name = path.getName();
        System.out.println(name);
        /*// Merely checking whether the name contains FILETYPE is not reliable and may delete the wrong files,
        // so check whether the name ends with FILETYPE instead
        if(name.indexOf(FILETYPE) != -1){
            fs.delete(path, true);
        }*/
        // Check whether the name ends with FILETYPE (guard against names shorter than the suffix)
        if (name.length() < FILETYPE.length()) {
            return;
        }
        int startIndex = name.length() - FILETYPE.length();
        int endIndex = name.length();
        // Extract the file suffix
        String fileSuffix = name.substring(startIndex, endIndex);
        if (fileSuffix.equals(FILETYPE)) {
            fs.delete(path, true);
        }
    }
}
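
A shorter alternative is to let FileSystem.listFiles do the recursion and use String.endsWith for the suffix check. A hedged sketch (class name is mine) of the same deletion logic:

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class HDFS_DELETE_BY_SUFFIX {

    public static final String FILETYPE = "tar.gz";

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(new URI("hdfs://hadoop1:9000"), conf, "hadoop");

        // listFiles(path, true) walks the tree recursively and returns files only,
        // so no hand-written recursion is needed
        RemoteIterator<LocatedFileStatus> it = fs.listFiles(new Path("/aa"), true);
        while (it.hasNext()) {
            Path p = it.next().getPath();
            if (p.getName().endsWith(FILETYPE)) {
                fs.delete(p, false);   // a plain file, so a non-recursive delete is enough
            }
        }
        fs.close();
    }
}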

7. Deleting all empty files and empty directories in the HDFS cluster

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class DeleteEmptyDirAndFile {

    static FileSystem fs = null;

    public static void main(String[] args) throws Exception {

        initFileSystem();

        // Create test data
        // makeTestData();

        // Delete the test data
        // deleteTestData();

        // Delete the empty files and empty directories under the given directory
        deleteEmptyDirAndFile(new Path("/aa"));
    }

    /**
     * Delete the empty files and empty directories under the given directory
     * @throws Exception
     */
    public static void deleteEmptyDirAndFile(Path path) throws Exception {

        // If the directory itself is already empty
        FileStatus[] listStatus = fs.listStatus(path);
        if (listStatus.length == 0) {
            fs.delete(path, true);
            return;
        }

        // listLocatedStatus returns both the files and the subdirectories of the given directory
        RemoteIterator<LocatedFileStatus> listLocatedStatus = fs.listLocatedStatus(path);

        while (listLocatedStatus.hasNext()) {
            LocatedFileStatus next = listLocatedStatus.next();

            Path currentPath = next.getPath();
            // The parent directory
            Path parent = next.getPath().getParent();

            // If it is a directory, keep descending and delete whatever qualifies (empty directories)
            if (next.isDirectory()) {

                // If it is an empty directory
                if (fs.listStatus(currentPath).length == 0) {
                    // Delete it
                    fs.delete(currentPath, true);
                } else {
                    // Not empty, so keep descending
                    if (fs.exists(currentPath)) {
                        deleteEmptyDirAndFile(currentPath);
                    }
                }

            // If it is a file
            } else {
                // Get the file length
                long fileLength = next.getLen();
                // Delete the file if it is empty
                if (fileLength == 0) {
                    fs.delete(currentPath, true);
                }
            }

            // Deleting an empty file or empty directory may leave the parent directory empty as well,
            // so after every deletion check the parent and delete it too if it has become empty
            int length = fs.listStatus(parent).length;
            if (length == 0) {
                fs.delete(parent, true);
            }
        }
    }

    /**
     * Initialize the FileSystem object
     */
    public static void initFileSystem() throws Exception {
        Configuration conf = new Configuration();
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        conf.addResource("config/core-site.xml");
        conf.addResource("config/hdfs-site.xml");
        fs = FileSystem.get(conf);
    }

    /**
     * Create test data
     */
    public static void makeTestData() throws Exception {

        String emptyFilePath = "D:\\bigdata\\1704mr_test\\empty.txt";
        String notEmptyFilePath = "D:\\bigdata\\1704mr_test\\notEmpty.txt";

        // Directory containing an empty directory and an empty file
        String path1 = "/aa/bb1/cc1/dd1/";
        fs.mkdirs(new Path(path1));
        fs.mkdirs(new Path("/aa/bb1/cc1/dd2/"));
        fs.copyFromLocalFile(new Path(emptyFilePath), new Path(path1));
        fs.copyFromLocalFile(new Path(notEmptyFilePath), new Path(path1));

        // Directory containing an empty file
        String path2 = "/aa/bb1/cc2/dd2/";
        fs.mkdirs(new Path(path2));
        fs.copyFromLocalFile(new Path(emptyFilePath), new Path(path2));

        // Directory containing a non-empty file
        String path3 = "/aa/bb2/cc3/dd3";
        fs.mkdirs(new Path(path3));
        fs.copyFromLocalFile(new Path(notEmptyFilePath), new Path(path3));

        // Empty directory
        String path4 = "/aa/bb2/cc4/dd4";
        fs.mkdirs(new Path(path4));

        System.out.println("Test data created");
    }

    /**
     * Delete the test directory
     * @throws Exception
     */
    public static void deleteTestData() throws Exception {
        boolean delete = fs.delete(new Path("/aa"), true);
        System.out.println(delete ? "Test data deleted" : "Failed to delete test data");
    }
}
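
A related shortcut: FileSystem.getContentSummary reports the number of files, directories and bytes under a path in a single call, which makes an "is this subtree empty?" check cheap. A hedged sketch:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SubtreeEmptyCheck {

    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop1:9000");
        FileSystem fs = FileSystem.get(conf);

        Path path = new Path("/aa");
        // ContentSummary aggregates the whole subtree in one NameNode call
        ContentSummary cs = fs.getContentSummary(path);
        System.out.println("directories: " + cs.getDirectoryCount()
                + ", files: " + cs.getFileCount()
                + ", bytes: " + cs.getLength());

        // The subtree holds no files and no data at all, only (possibly empty) directories
        if (cs.getFileCount() == 0 && cs.getLength() == 0) {
            System.out.println(path + " contains no files");
        }
        fs.close();
    }
}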

8. Manually copying a specific data block (e.g. the second block of a file)

/**
 * Manually copy a specific data block (for example, the second block of a file).
 * Note: this method assumes an initialized FileSystem field named fs, as in the previous examples.
 */
public static void copyBlock(String str, int num) {

    Path path = new Path(str);

    BlockLocation[] locations = new BlockLocation[0];

    try {
        FileStatus fileStatus = fs.getFileStatus(path);

        locations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());

        /*for(int i = 0; i < locations.length; i++) {
            // 0,134217728,hadoop1,hadoop3
            // 134217728,64789382,hadoop3,hadoop1
            System.out.println(locations[i]);
        }*/

        /*System.out.println(locations[num-1].getOffset());
        System.out.println(locations[num-1].getLength());
        String[] hosts = locations[num-1].getHosts();*/

        FSDataInputStream open = fs.open(path);
        // Seek to the start offset of block number num, then copy from there to the end of the file
        open.seek(locations[num - 1].getOffset());
        OutputStream out = new FileOutputStream(new File("D:/abc.tar.gz"));

        IOUtils.copyBytes(open, out, 4096, true);

    } catch (IOException e) {
        e.printStackTrace();
    }
}
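
As written, the copy runs from the chosen block's offset to the end of the file. To copy only that one block, the block's length has to bound the copy. A hedged sketch of a companion method (parameter names are mine; it assumes the same static fs field, and that the IOUtils.copyBytes overload taking a byte count is available in the Hadoop version in use):

/**
 * Copy exactly one data block (block number num, 1-based) of the given HDFS file to a local file.
 */
public static void copySingleBlock(String hdfsFile, int num, String localFile) throws IOException {
    Path path = new Path(hdfsFile);
    FileStatus fileStatus = fs.getFileStatus(path);
    BlockLocation[] locations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());

    BlockLocation block = locations[num - 1];

    try (FSDataInputStream in = fs.open(path);
         OutputStream out = new FileOutputStream(localFile)) {
        in.seek(block.getOffset());
        // Copy only block.getLength() bytes instead of reading to the end of the file
        IOUtils.copyBytes(in, out, block.getLength(), 4096, false);
    }
}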

9. Computing the percentage of files in HDFS that are smaller than the cluster's default block size

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

/**
 * Compute the fraction of files in HDFS that are smaller than the cluster's default block size.
 * For example: 98 files of at least 128M and 2 files under 128M give an answer of 2%.
 */
public class Exam1_SmallFilePercent {

    private static int DEFAULT_BLOCKSIZE = 128 * 1024 * 1024;

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop1:9000");
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        FileSystem fs = FileSystem.get(conf);

        Path path = new Path("/");
        float smallFilePercent = getSmallFilePercent(fs, path);
        System.out.println(smallFilePercent);

        fs.close();
    }

    /**
     * Compute the ratio of small files to total files under the given directory.
     * Note: listFiles(path, false) only looks at the top level; pass true to walk the whole tree.
     * @throws Exception
     */
    private static float getSmallFilePercent(FileSystem fs, Path path) throws Exception {

        int smallFile = 0;
        int totalFile = 0;

        RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(path, false);
        while (listFiles.hasNext()) {
            totalFile++;
            LocatedFileStatus next = listFiles.next();
            long len = next.getLen();
            if (len < DEFAULT_BLOCKSIZE) {
                smallFile++;
            }
        }
        System.out.println(smallFile + " : " + totalFile);

        return smallFile * 1f / totalFile;
    }
}
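
Instead of hard-coding 128M, the default block size can also be read from the cluster itself. A small hedged sketch:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DefaultBlockSizePrinter {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop1:9000");
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        FileSystem fs = FileSystem.get(conf);

        // Ask the file system for the default block size it would use for new files under "/"
        long blockSize = fs.getDefaultBlockSize(new Path("/"));
        System.out.println("default block size: " + blockSize);   // 134217728 on a stock cluster

        fs.close();
    }
}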

10. Computing the average number of data blocks per file in HDFS (total blocks / total files)

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

/**
 * Compute the average number of data blocks per file in HDFS (total blocks / total files).
 * For example: one file with 5 blocks and one file with 3 blocks give an average of 4;
 * add one more file with a single block and the average for the whole of HDFS becomes 3.
 */
public class Exam2_HDSFAvgBlocks {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop1:9000");
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        FileSystem fs = FileSystem.get(conf);

        Path path = new Path("/");
        float avgHDFSBlocks = getHDFSAvgBlocks(fs, path);
        System.out.println("Average number of blocks per file in HDFS: " + avgHDFSBlocks);

        fs.close();
    }

    /**
     * Compute the average number of blocks per file under the given directory.
     */
    private static float getHDFSAvgBlocks(FileSystem fs, Path path) throws Exception {

        int totalFiles = 0;   // total number of files
        int totalBlocks = 0;  // total number of blocks

        RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(path, false);

        while (listFiles.hasNext()) {
            LocatedFileStatus next = listFiles.next();
            int length = next.getBlockLocations().length;
            totalBlocks += length;
            // Empty files have no blocks, so only non-empty files are counted
            if (next.getLen() != 0) {
                totalFiles++;
            }
        }
        System.out.println(totalBlocks + " : " + totalFiles);

        return totalBlocks * 1f / totalFiles;
    }
}

11. Computing the average number of replicas per block in HDFS (total replicas / total blocks)

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

/**
 * Compute the average replication factor in HDFS (total replicas / total blocks).
 * For example, with two files, the first having 5 blocks with 3 replicas each and the second
 * having 2 blocks with 2 replicas each, the average is (5*3 + 2*2) / (5+2) = 19/7 ≈ 2.71.
 */
public class Exam3_HDSFAvgBlockCopys {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop02:9000");
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        FileSystem fs = FileSystem.get(conf);

        Path path = new Path("/");
        float avgHDFSBlockCopys = getHDFSAvgBlockCopys(fs, path);
        System.out.println("Average number of replicas per block in HDFS: " + avgHDFSBlockCopys);

        fs.close();
    }

    /**
     * Compute the average number of replicas per block for all files under the given directory.
     */
    private static float getHDFSAvgBlockCopys(FileSystem fs, Path path) throws Exception {

        int totalCopy = 0;    // total number of replicas
        int totalBlocks = 0;  // total number of blocks

        RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(path, false);

        while (listFiles.hasNext()) {
            LocatedFileStatus next = listFiles.next();

            int length = next.getBlockLocations().length;
            short replication = next.getReplication();

            totalBlocks += length;
            totalCopy += length * replication;
        }
        System.out.println(totalCopy + " : " + totalBlocks);

        return totalCopy * 1f / totalBlocks;
    }
}

12. Computing the proportion of blocks across HDFS that are smaller than their configured block size

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

/**
 * Compute the proportion of blocks in HDFS that are smaller than the file's configured block size.
 * For example, with a block size of 128M and 100 blocks in total, of which 5 are not a full 128M,
 * the proportion is 5%.
 * Note: different files may have been written with different block sizes, so the default 128M
 * must not be assumed across the board; use each file's own block size.
 */
public class Exam4_LTBlockSize {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop02:9000");
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        FileSystem fs = FileSystem.get(conf);

        Path path = new Path("/");
        float lessThanBlocksizeRatio = getLessThanBlocksizeBlocks(fs, path);
        System.out.println("Proportion of blocks smaller than their configured block size: " + lessThanBlocksizeRatio);

        fs.close();
    }

    private static float getLessThanBlocksizeBlocks(FileSystem fs, Path path) throws Exception {

        int totalBlocks = 0;               // total number of blocks
        int lessThenBlocksizeBlocks = 0;   // blocks smaller than the configured block size

        RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(path, false);

        while (listFiles.hasNext()) {
            LocatedFileStatus next = listFiles.next();

            BlockLocation[] blockLocations = next.getBlockLocations();
            int length = blockLocations.length;

            if (length != 0) {
                totalBlocks += length;
                // Only the last block of a file can be shorter than that file's block size
                long lastBlockSize = blockLocations[length - 1].getLength();
                long blockSize = next.getBlockSize();
                if (lastBlockSize < blockSize) {
                    lessThenBlocksizeBlocks++;
                }
            }
        }
        System.out.println(lessThenBlocksizeBlocks + " : " + totalBlocks);

        return lessThenBlocksizeBlocks * 1f / totalBlocks;
    }
}

13. Computing the total amount of water trapped by a given array (treating each value as terrain height)

/**
 * Compute the total amount of water trapped by a given array (treat each array value as a terrain height).
 * For example: int[] intArray = new int[]{4,3,2,5,6,4,4,7}
 * traps [0,1,2,0,0,2,2,0] units of water, for a total of 7.
 *
 * Core idea: slice the array into many 0/1 arrays, one per height level, and count the valid zeros
 * of each 0/1 array (the zeros that lie between the leftmost 1 and the rightmost 1).
 */
public class Exam5_WaterStoreOfArray {

    public static void main(String[] args) {

        // int[] intArray = new int[]{4,3,2,5,6,4,4,7};
        // int[] intArray = new int[]{1,2,3,4,5,6};
        int[] intArray = new int[]{3,1,2,7,3,8,4,9,5,6};

        int totalWater = getArrayWater(intArray);
        System.out.println(totalWater);
    }

    /**
     * Compute the amount of water held by the array.
     */
    private static int getArrayWater(int[] intArray) {

        int findMaxValueOfArray = findMaxValueOfArray(intArray);
        int findMinValueOfArray = findMinValueOfArray(intArray);
        int length = intArray.length;

        int totalWater = 0;

        // The number of levels is the difference between the maximum and the minimum
        for (int i = findMinValueOfArray; i < findMaxValueOfArray; i++) {
            // Build the 0/1 array for this level
            int[] tempArray = new int[length];
            for (int j = 0; j < length; j++) {
                if (intArray[j] > i) {
                    tempArray[j] = 1;
                } else {
                    tempArray[j] = 0;
                }
            }
            // Count the valid zeros of this 0/1 array
            int waterOfOneZeroArray = getWaterOfOneZeroArray(tempArray);
            totalWater += waterOfOneZeroArray;
        }
        return totalWater;
    }

    /**
     * Search strategy: find one 1 from the left and one 1 from the right;
     * the number of zeros between these two 1s is the water held at this level.
     */
    private static int getWaterOfOneZeroArray(int[] tempArray) {

        int length = tempArray.length;
        int totalWater = 0;

        // Find the leftmost 1
        int i = 0;
        while (i < length) {
            if (tempArray[i] == 1) {
                break;
            }
            i++;
        }

        // Find the rightmost 1
        int j = length - 1;
        while (j >= i) {
            if (tempArray[j] == 1) {
                break;
            }
            j--;
        }

        // Count the zeros between the two 1s
        if (i == j || i + 1 == j) {
            return 0;
        } else {
            for (int k = i + 1; k < j; k++) {
                if (tempArray[k] == 0) {
                    totalWater++;
                }
            }
            return totalWater;
        }
    }

    /**
     * Find the maximum value of an array.
     */
    public static int findMaxValueOfArray(int[] intArray) {
        int length = intArray.length;
        if (length == 0) {
            return 0;
        } else if (length == 1) {
            return intArray[0];
        } else {
            int max = intArray[0];
            for (int i = 1; i < length; i++) {
                if (intArray[i] > max) {
                    max = intArray[i];
                }
            }
            return max;
        }
    }

    /**
     * Find the minimum value of an array.
     */
    public static int findMinValueOfArray(int[] intArray) {
        int length = intArray.length;
        if (length == 0) {
            return 0;
        } else if (length == 1) {
            return intArray[0];
        } else {
            int min = intArray[0];
            for (int i = 1; i < length; i++) {
                if (intArray[i] < min) {
                    min = intArray[i];
                }
            }
            return min;
        }
    }
}
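
The layer-by-layer approach above is O(n · (max - min)). For reference, a hedged sketch of the classic two-pointer variant, which computes the same answer in a single O(n) pass by tracking the highest wall seen from each side:

public class Exam5_WaterStoreTwoPointer {

    public static void main(String[] args) {
        int[] intArray = new int[]{4, 3, 2, 5, 6, 4, 4, 7};
        System.out.println(getArrayWater(intArray));   // prints 7, matching the example above
    }

    /**
     * Two-pointer solution: the water above each position is bounded by the lower of the
     * highest walls to its left and right; always advance from the lower side and accumulate.
     */
    public static int getArrayWater(int[] height) {
        int left = 0, right = height.length - 1;
        int leftMax = 0, rightMax = 0;
        int totalWater = 0;

        while (left < right) {
            if (height[left] < height[right]) {
                // The left wall is the limiting side here
                if (height[left] >= leftMax) {
                    leftMax = height[left];
                } else {
                    totalWater += leftMax - height[left];
                }
                left++;
            } else {
                // The right wall is the limiting side here
                if (height[right] >= rightMax) {
                    rightMax = height[right];
                } else {
                    totalWater += rightMax - height[right];
                }
                right--;
            }
        }
        return totalWater;
    }
}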